NASA RUL Prediction: EDA, Data Cleaning, Feature Engineering and Model Training¶
Importing packages and modules¶
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import pickle
import yaml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, r2_score
import plotly.io as pio
# Render plotly figures inline in the notebook.
pio.renderers.default='notebook'
print(os.getcwd())
# True: fit scalers/models from scratch; False: load previously saved artifacts.
training = True
c:\Users\balaj\Desktop\Projects\Predictive_Maintenance-NASA_Turbofan_Jet_Engine\FD001 Analysis
# Reload the local preprocessing module so edits to preprocess.py are picked
# up without restarting the kernel, then build the helper objects.
import importlib

import preprocess

importlib.reload(preprocess)
from preprocess import Preprocessor, Charting

if training:
    p = Preprocessor()
else:
    # Inference mode: restore the MinMax scaler fitted during training so the
    # preprocessor scales new data identically to the training run.
    with open('./saved_scalers/minmaxscaler.pkl', 'rb') as fh:
        saved_scaler = pickle.load(fh)
    p = Preprocessor(saved_scaler)
c = Charting()
# Load the raw FD001 training set: space-separated values, no header row.
# (read_csv with an explicit sep is identical to read_table here — the two
# differ only in the default separator.)
train = pd.read_csv("../Data/train_FD001.txt", sep=' ', header=None)
Cleaning and Preprocessing¶
# Renaming columns
# Assign meaningful names (unit, cycles, ops1-3, s1-s23) to the anonymous
# integer columns produced by header=None — presumably defined in
# preprocess.Preprocessor.rename_columns; verify against that module.
train = p.rename_columns(train)
train.head()
| unit | cycles | ops1 | ops2 | ops3 | s1 | s2 | s3 | s4 | s5 | ... | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | s22 | s23 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | -0.0007 | -0.0004 | 100.0 | 518.67 | 641.82 | 1589.70 | 1400.60 | 14.62 | ... | 8138.62 | 8.4195 | 0.03 | 392 | 2388 | 100.0 | 39.06 | 23.4190 | NaN | NaN |
| 1 | 1 | 2 | 0.0019 | -0.0003 | 100.0 | 518.67 | 642.15 | 1591.82 | 1403.14 | 14.62 | ... | 8131.49 | 8.4318 | 0.03 | 392 | 2388 | 100.0 | 39.00 | 23.4236 | NaN | NaN |
| 2 | 1 | 3 | -0.0043 | 0.0003 | 100.0 | 518.67 | 642.35 | 1587.99 | 1404.20 | 14.62 | ... | 8133.23 | 8.4178 | 0.03 | 390 | 2388 | 100.0 | 38.95 | 23.3442 | NaN | NaN |
| 3 | 1 | 4 | 0.0007 | 0.0000 | 100.0 | 518.67 | 642.35 | 1582.79 | 1401.87 | 14.62 | ... | 8133.83 | 8.3682 | 0.03 | 392 | 2388 | 100.0 | 38.88 | 23.3739 | NaN | NaN |
| 4 | 1 | 5 | -0.0019 | -0.0002 | 100.0 | 518.67 | 642.37 | 1582.85 | 1406.22 | 14.62 | ... | 8133.80 | 8.4294 | 0.03 | 393 | 2388 | 100.0 | 38.90 | 23.4044 | NaN | NaN |
5 rows × 28 columns
# The raw file ends each row with trailing separators, so parsing yields two
# fully-empty columns (s22, s23); keep only columns with at least one value.
train = train.loc[:, train.notna().any()]
# Sanity check: confirm there are no remaining missing values in any column.
train.isna().sum().tolist()
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
It is important to understand the source of our data here. The data is the result of a simulation, so outliers due to data-entry errors or environmental factors are ruled out. Outlier sensor values could hold key information about the health of the engine, so we will not remove them.
# Summary statistics; several columns (ops3, s1, s5, s16, s18, s19) show
# identical min and max, motivating the constant-column drop that follows.
train.describe()
| unit | cycles | ops1 | ops2 | ops3 | s1 | s2 | s3 | s4 | s5 | ... | s12 | s13 | s14 | s15 | s16 | s17 | s18 | s19 | s20 | s21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 20631.000000 | 20631.000000 | 20631.000000 | 20631.000000 | 20631.0 | 2.063100e+04 | 20631.000000 | 20631.000000 | 20631.000000 | 2.063100e+04 | ... | 20631.000000 | 20631.000000 | 20631.000000 | 20631.000000 | 2.063100e+04 | 20631.000000 | 20631.0 | 20631.0 | 20631.000000 | 20631.000000 |
| mean | 51.506568 | 108.807862 | -0.000009 | 0.000002 | 100.0 | 5.186700e+02 | 642.680934 | 1590.523119 | 1408.933782 | 1.462000e+01 | ... | 521.413470 | 2388.096152 | 8143.752722 | 8.442146 | 3.000000e-02 | 393.210654 | 2388.0 | 100.0 | 38.816271 | 23.289705 |
| std | 29.227633 | 68.880990 | 0.002187 | 0.000293 | 0.0 | 6.537152e-11 | 0.500053 | 6.131150 | 9.000605 | 3.394700e-12 | ... | 0.737553 | 0.071919 | 19.076176 | 0.037505 | 1.556432e-14 | 1.548763 | 0.0 | 0.0 | 0.180746 | 0.108251 |
| min | 1.000000 | 1.000000 | -0.008700 | -0.000600 | 100.0 | 5.186700e+02 | 641.210000 | 1571.040000 | 1382.250000 | 1.462000e+01 | ... | 518.690000 | 2387.880000 | 8099.940000 | 8.324900 | 3.000000e-02 | 388.000000 | 2388.0 | 100.0 | 38.140000 | 22.894200 |
| 25% | 26.000000 | 52.000000 | -0.001500 | -0.000200 | 100.0 | 5.186700e+02 | 642.325000 | 1586.260000 | 1402.360000 | 1.462000e+01 | ... | 520.960000 | 2388.040000 | 8133.245000 | 8.414900 | 3.000000e-02 | 392.000000 | 2388.0 | 100.0 | 38.700000 | 23.221800 |
| 50% | 52.000000 | 104.000000 | 0.000000 | 0.000000 | 100.0 | 5.186700e+02 | 642.640000 | 1590.100000 | 1408.040000 | 1.462000e+01 | ... | 521.480000 | 2388.090000 | 8140.540000 | 8.438900 | 3.000000e-02 | 393.000000 | 2388.0 | 100.0 | 38.830000 | 23.297900 |
| 75% | 77.000000 | 156.000000 | 0.001500 | 0.000300 | 100.0 | 5.186700e+02 | 643.000000 | 1594.380000 | 1414.555000 | 1.462000e+01 | ... | 521.950000 | 2388.140000 | 8148.310000 | 8.465600 | 3.000000e-02 | 394.000000 | 2388.0 | 100.0 | 38.950000 | 23.366800 |
| max | 100.000000 | 362.000000 | 0.008700 | 0.000600 | 100.0 | 5.186700e+02 | 644.530000 | 1616.910000 | 1441.490000 | 1.462000e+01 | ... | 523.380000 | 2388.560000 | 8293.720000 | 8.584800 | 3.000000e-02 | 400.000000 | 2388.0 | 100.0 | 39.430000 | 23.618400 |
8 rows × 26 columns
# Dropping cols where the min and max are same. i.e. No information is gained
train = p.drop_no_info_cols(train)
# Creating RUL column
# RUL counts down to 0 at the unit's final observed cycle — presumably
# computed as (max cycle for the unit) - (current cycle) inside
# Preprocessor.create_rul_col; confirm against the preprocess module.
train = p.create_rul_col(train)
# Refresh the Preprocessor's cached op-setting / sensor column lists so they
# match the columns that survived the drop above.
p.update_col_groups(train)
| unit | cycles | rul | ops1 | ops2 | s2 | s3 | s4 | s6 | s7 | s8 | s9 | s11 | s12 | s13 | s14 | s15 | s17 | s20 | s21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 191 | -0.0007 | -0.0004 | 641.82 | 1589.70 | 1400.60 | 21.61 | 554.36 | 2388.06 | 9046.19 | 47.47 | 521.66 | 2388.02 | 8138.62 | 8.4195 | 392 | 39.06 | 23.4190 |
| 1 | 1 | 2 | 190 | 0.0019 | -0.0003 | 642.15 | 1591.82 | 1403.14 | 21.61 | 553.75 | 2388.04 | 9044.07 | 47.49 | 522.28 | 2388.07 | 8131.49 | 8.4318 | 392 | 39.00 | 23.4236 |
| 2 | 1 | 3 | 189 | -0.0043 | 0.0003 | 642.35 | 1587.99 | 1404.20 | 21.61 | 554.26 | 2388.08 | 9052.94 | 47.27 | 522.42 | 2388.03 | 8133.23 | 8.4178 | 390 | 38.95 | 23.3442 |
| 3 | 1 | 4 | 188 | 0.0007 | 0.0000 | 642.35 | 1582.79 | 1401.87 | 21.61 | 554.45 | 2388.11 | 9049.48 | 47.13 | 522.86 | 2388.08 | 8133.83 | 8.3682 | 392 | 38.88 | 23.3739 |
| 4 | 1 | 5 | 187 | -0.0019 | -0.0002 | 642.37 | 1582.85 | 1406.22 | 21.61 | 554.00 | 2388.06 | 9055.15 | 47.28 | 522.19 | 2388.04 | 8133.80 | 8.4294 | 393 | 38.90 | 23.4044 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 20626 | 100 | 196 | 4 | -0.0004 | -0.0003 | 643.49 | 1597.98 | 1428.63 | 21.61 | 551.43 | 2388.19 | 9065.52 | 48.07 | 519.49 | 2388.26 | 8137.60 | 8.4956 | 397 | 38.49 | 22.9735 |
| 20627 | 100 | 197 | 3 | -0.0016 | -0.0005 | 643.54 | 1604.50 | 1433.58 | 21.61 | 550.86 | 2388.23 | 9065.11 | 48.04 | 519.68 | 2388.22 | 8136.50 | 8.5139 | 395 | 38.30 | 23.1594 |
| 20628 | 100 | 198 | 2 | 0.0004 | 0.0000 | 643.42 | 1602.46 | 1428.18 | 21.61 | 550.94 | 2388.24 | 9065.90 | 48.09 | 520.01 | 2388.24 | 8141.05 | 8.5646 | 398 | 38.44 | 22.9333 |
| 20629 | 100 | 199 | 1 | -0.0011 | 0.0003 | 643.23 | 1605.26 | 1426.53 | 21.61 | 550.68 | 2388.25 | 9073.72 | 48.39 | 519.67 | 2388.23 | 8139.29 | 8.5389 | 395 | 38.29 | 23.0640 |
| 20630 | 100 | 200 | 0 | -0.0032 | -0.0005 | 643.85 | 1600.38 | 1432.14 | 21.61 | 550.79 | 2388.26 | 9061.48 | 48.20 | 519.30 | 2388.26 | 8137.33 | 8.5036 | 396 | 38.37 | 23.0522 |
20631 rows × 20 columns
# Distribution of engine lifetimes (max cycles per unit).
fig_1 = c.plot_max_cycles_dist(train)
fig_1
Handling Skew¶
# Target distribution: shows RUL is right-skewed (few rows with very high RUL).
fig_2 = px.histogram(train['rul'], title="Histogram of RUL")
fig_2
We find that the distribution is right-skewed. This will distort our predictions, as very few units have max cycles above 240. Let us try clipping or a log transform to account for the skew. Also, for safety reasons it is better to underestimate RUL than to overestimate it.
# log1p is used instead of log so that rul == 0 is handled safely.
fig_3 = px.histogram(np.log1p(train['rul']), title="Histogram of log(RUL)")
fig_3
Taking the log transform has pushed the weight of the distribution to the right. This is not ideal, as the model would learn to predict values at the extreme right and fail to predict the smaller values. Let us try clipping and undersampling instead.
# Preview of clipping: cap RUL at 250 (plot only; applied to the data below).
fig_4 = px.histogram(train['rul'].clip(upper=250), title="Histogram of clipped RUL")
fig_4
Clipping¶
# Cap the target at 250 cycles: early-life degradation is barely observable,
# and underestimating RUL is safer than overestimating it.
train['rul'] = train['rul'].clip(upper=250)
Undersampling¶
# Undersample high-RUL rows so the long right tail does not dominate training.
high_rul_threshold = 190
df = train.copy()

# Partition rows at the threshold.
is_high_rul = df['rul'] > high_rul_threshold
df_high_rul = df[is_high_rul]
df_low_rul = df[~is_high_rul]

# Keep a 10% random sample of the high-RUL rows (fixed seed for reproducibility).
df_high_rul_sampled = df_high_rul.sample(frac=0.1, random_state=42)

# Recombine: every low-RUL row plus the sampled high-RUL rows.
df_balanced = pd.concat([df_low_rul, df_high_rul_sampled])

fig_5 = px.histogram(df_balanced['rul'], title="Histogram of undersampled RUL")
fig_5
# Let us normalize the operating setting and sensor values for each unit
train = df_balanced
# NOTE(review): this train_norm is recomputed in the "MinMax Scaling" section
# over a different column set before it is ever used, so the result of this
# call is effectively discarded.
train_norm = p.normalize_predictors(train, cols=['cycles']+p.get_op_setting_cols()+p.get_sensor_cols())
Grouped analysis¶
# Assuming sensor variation is similar across units, group rows by RUL and
# average the op-setting and sensor values at each RUL level.
grouped_train = p.group_by_rul(train, cols=(p.op_setting_cols+p.sensor_cols))
grouped_train
| ops1 | ops2 | s2 | s3 | s4 | s6 | s7 | s8 | s9 | s11 | s12 | s13 | s14 | s15 | s17 | s20 | s21 | unit | cycles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| rul | |||||||||||||||||||
| 0 | 0.000044 | -0.000050 | 643.721800 | 1602.499000 | 1430.873200 | 21.610000 | 551.361700 | 2388.246300 | 9099.359600 | 48.179800 | 519.671500 | 2388.251800 | 8168.124500 | 8.524108 | 396.600000 | 38.402200 | 23.053391 | 100 | 206.31000 |
| 1 | -0.000218 | -0.000005 | 643.714700 | 1602.184500 | 1429.019800 | 21.610000 | 551.427200 | 2388.239600 | 9099.609300 | 48.170200 | 519.790400 | 2388.239500 | 8167.558400 | 8.525559 | 396.250000 | 38.440600 | 23.064854 | 100 | 205.31000 |
| 2 | -0.000173 | 0.000035 | 643.683700 | 1602.212900 | 1428.166200 | 21.610000 | 551.498500 | 2388.233000 | 9098.004700 | 48.149500 | 519.773700 | 2388.232200 | 8166.927900 | 8.519881 | 396.160000 | 38.460600 | 23.074655 | 100 | 204.31000 |
| 3 | 0.000053 | -0.000035 | 643.627500 | 1601.869400 | 1429.034000 | 21.610000 | 551.573600 | 2388.226000 | 9097.151900 | 48.105600 | 519.866100 | 2388.225200 | 8166.301200 | 8.517111 | 396.100000 | 38.449700 | 23.072819 | 100 | 203.31000 |
| 4 | -0.000176 | -0.000008 | 643.602300 | 1602.679500 | 1427.540600 | 21.610000 | 551.613200 | 2388.215700 | 9095.834400 | 48.105900 | 519.944200 | 2388.218800 | 8165.420400 | 8.515365 | 396.090000 | 38.476900 | 23.084228 | 100 | 202.31000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 245 | -0.000167 | 0.000233 | 642.393333 | 1584.106667 | 1400.446667 | 21.610000 | 553.636667 | 2388.096667 | 9057.453333 | 47.376667 | 521.800000 | 2388.060000 | 8137.600000 | 8.430433 | 393.000000 | 38.943333 | 23.330900 | 3 | 23.00000 |
| 246 | 0.001900 | 0.000200 | 641.890000 | 1584.450000 | 1400.740000 | 21.610000 | 554.380000 | 2388.020000 | 9055.430000 | 47.040000 | 522.280000 | 2388.020000 | 8140.020000 | 8.414000 | 392.000000 | 39.110000 | 23.438100 | 1 | 10.00000 |
| 247 | -0.000700 | 0.000250 | 641.950000 | 1581.025000 | 1401.985000 | 21.610000 | 554.370000 | 2387.985000 | 9060.490000 | 47.165000 | 522.135000 | 2388.005000 | 8142.850000 | 8.379500 | 392.500000 | 38.960000 | 23.457600 | 2 | 49.00000 |
| 249 | -0.000100 | 0.000100 | 642.805000 | 1589.125000 | 1408.995000 | 21.610000 | 554.325000 | 2388.060000 | 9063.030000 | 47.410000 | 521.970000 | 2388.050000 | 8137.985000 | 8.426600 | 392.500000 | 38.855000 | 23.426300 | 2 | 73.50000 |
| 250 | 0.000090 | -0.000043 | 642.368571 | 1586.181905 | 1402.810317 | 21.609365 | 553.998889 | 2388.047937 | 9059.250000 | 47.321587 | 522.036032 | 2388.050476 | 8140.513016 | 8.414340 | 392.015873 | 38.932540 | 23.377263 | 63 | 37.52381 |
248 rows × 19 columns
# Plotting average of sensor values across units at each RUL level.
fig_6 = c.plot_avg_sensor_data(grouped_train, p.sensor_cols)
fig_6
# Drill into a single engine to compare its sensor traces against the fleet
# averages plotted above.
specific_unit = 5
unit_mask = train['unit'] == specific_unit
fig_7 = px.line(
    train[unit_mask],
    x='rul',
    y=p.sensor_cols,
    title=f'Sensor values vs RUL for unit {specific_unit}',
)
fig_7
We observe that the relationship between the features is linear, so we will not go ahead with PCA.
# Observing standard deviation of sensor values per unit.
# NOTE(review): on newer pandas, GroupBy.std() may need numeric_only handling
# if non-numeric columns are present — all columns here appear numeric.
grouped_train_by_unit = train.groupby('unit')
grouped_train_by_unit.std()
| cycles | rul | ops1 | ops2 | s2 | s3 | s4 | s6 | s7 | s8 | s9 | s11 | s12 | s13 | s14 | s15 | s17 | s20 | s21 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| unit | |||||||||||||||||||
| 1 | 55.281100 | 55.281100 | 0.001958 | 0.000281 | 0.484584 | 5.774892 | 8.574439 | 0.000000 | 0.910347 | 0.071006 | 4.922447 | 0.269032 | 0.751004 | 0.076879 | 5.532885 | 0.034192 | 1.478234 | 0.166671 | 0.105058 |
| 2 | 58.188281 | 58.188281 | 0.002409 | 0.000291 | 0.542825 | 6.565985 | 9.697729 | 0.001727 | 0.953939 | 0.063663 | 12.687025 | 0.293618 | 0.798770 | 0.060863 | 9.054435 | 0.041285 | 1.696242 | 0.199844 | 0.114641 |
| 3 | 51.816986 | 51.816986 | 0.002145 | 0.000290 | 0.474190 | 6.386222 | 8.610126 | 0.001482 | 0.733983 | 0.039957 | 35.196825 | 0.242410 | 0.594788 | 0.041219 | 29.439715 | 0.036620 | 1.570773 | 0.166112 | 0.104734 |
| 4 | 54.703748 | 54.703748 | 0.002033 | 0.000286 | 0.441446 | 6.109152 | 7.737463 | 0.000727 | 0.708327 | 0.032530 | 36.571690 | 0.222724 | 0.530158 | 0.034819 | 30.691780 | 0.031947 | 1.602848 | 0.159302 | 0.093804 |
| 5 | 57.842838 | 57.788207 | 0.002133 | 0.000274 | 0.561806 | 7.122926 | 9.615056 | 0.001418 | 0.903592 | 0.056193 | 22.042360 | 0.283126 | 0.763686 | 0.057025 | 17.125721 | 0.039838 | 1.751698 | 0.186312 | 0.112837 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 96 | 68.396948 | 64.470376 | 0.002182 | 0.000278 | 0.503308 | 5.529613 | 8.543837 | 0.001207 | 0.851344 | 0.061336 | 6.826768 | 0.254158 | 0.712911 | 0.060816 | 3.955502 | 0.036692 | 1.524709 | 0.177857 | 0.105736 |
| 97 | 55.654487 | 55.654487 | 0.002096 | 0.000288 | 0.474201 | 6.161177 | 7.771640 | 0.001018 | 0.683301 | 0.040848 | 40.901828 | 0.212708 | 0.531868 | 0.036409 | 34.220602 | 0.034542 | 1.586373 | 0.173743 | 0.094439 |
| 98 | 45.177428 | 45.177428 | 0.002144 | 0.000275 | 0.525905 | 6.312261 | 9.751562 | 0.001378 | 0.898644 | 0.078068 | 5.040509 | 0.297087 | 0.774711 | 0.074219 | 5.859512 | 0.035768 | 1.407692 | 0.177168 | 0.107449 |
| 99 | 53.549043 | 53.549043 | 0.002138 | 0.000274 | 0.502209 | 5.717641 | 8.832273 | 0.001037 | 0.887134 | 0.072693 | 3.943513 | 0.273733 | 0.762417 | 0.070630 | 4.163966 | 0.037850 | 1.476351 | 0.167803 | 0.106405 |
| 100 | 55.281100 | 55.281100 | 0.002150 | 0.000300 | 0.460400 | 5.431068 | 8.155693 | 0.000724 | 0.838083 | 0.059683 | 4.610475 | 0.249864 | 0.666146 | 0.061507 | 3.244749 | 0.034984 | 1.408806 | 0.183127 | 0.102241 |
100 rows × 19 columns
# Plotting correlation values between features and RUL.
fig_8 = c.plot_corr_heatmap(grouped_train)
fig_8
<module 'matplotlib.pyplot' from 'c:\\Users\\balaj\\anaconda3\\envs\\predictive_manintenance_project\\Lib\\site-packages\\matplotlib\\pyplot.py'>
We see that most sensor values show a strong positive or strong negative correlation with the RUL column.
MinMax Scaling¶
# Scale every predictor (everything except the unit id and the target) to
# [0, 1], then persist the fitted scaler so inference can reuse it.
feature_cols = [col for col in train.columns if col not in ['unit', 'rul']]
train_norm = p.normalize_predictors(train, feature_cols)
if training:
    with open('./saved_scalers/minmaxscaler.pkl', 'wb') as fh:
        pickle.dump(p.get_minmaxscaler(), fh)
PCA¶
# Standardize predictors (zero mean, unit variance) — PCA needs comparable
# feature scales — and persist the fitted scaler for inference.
std_cols = [col for col in train.columns if col not in ['unit', 'rul']]
train_std = p.standardize_predictors(train, std_cols)
if training:
    with open('./saved_scalers/stdscaler.pkl', 'wb') as fh:
        pickle.dump(p.get_stdscaler(), fh)

# Fit PCA on the standardized predictors and inspect the cumulative explained
# variance. (Per the printed output, the first component explains only ~52%,
# and roughly 16 components are needed to reach 99%.)
predicting_cols = ['cycles'] + p.sensor_cols + p.op_setting_cols
pca = PCA()
pca.fit(train_std[predicting_cols])
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print(cumulative_variance)
[0.52346332 0.64591475 0.70225799 0.75715798 0.81113599 0.83710345 0.86008292 0.88077593 0.90022406 0.91750375 0.93366846 0.94795772 0.95959637 0.97060553 0.9805554 0.99023362 0.99851438 1. ]
# Scree-style plot: cumulative explained variance (%) vs component count.
component_ids = list(range(1, len(cumulative_variance) + 1))
fig_9 = px.line(
    x=component_ids,
    y=cumulative_variance * 100,
    title="PCA components vs Variance",
)
fig_9.update_traces(mode='lines+markers')
fig_9.update_layout(xaxis_title="Components", yaxis_title="Variance")
fig_9
Interestingly, performing PCA yielded worse performance. This is probably due to important degradation signals being lost.
Data Splitting and Storing¶
# Split the normalized data into X and y
X = train_norm.drop(columns=['rul', 'unit'])
y = train_norm['rul']
# Split the data into training and testing sets
# NOTE(review): rows are split at random, so cycles from the same engine unit
# land in both train and test; adjacent cycles are highly correlated, which
# likely inflates the reported test metrics. Consider a unit-wise (group)
# split instead.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)
# Persist the processed target and feature matrix for downstream use.
dataframe_y = pd.DataFrame(y)
dataframe_X = pd.DataFrame(X)
dataframe_y.to_parquet("../Data/Processed/y.parquet", engine="pyarrow", index=False)
dataframe_X.to_parquet("../Data/Processed/X.parquet", engine="pyarrow", index=False)
Model Training¶
Random Forest¶
Base Model with default hyperparameters¶
%%time
# Baseline: a Random Forest with library-default hyperparameters, to anchor
# the tuned models that follow.
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

# Score the baseline on the held-out split.
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')
Root Mean Squared Error: 25.67995823099634 R-squared: 0.7812235081313054 CPU times: total: 16.7 s Wall time: 16.2 s
# Persist the baseline model and its hyperparameters (training runs only).
if training:
    with open('./saved_models/Random Forest/vanilla_rf_model.pkl', 'wb') as fh:
        pickle.dump(rf_model, fh)
    with open('./saved_models/Random Forest/vanilla_rf_hyperparameters.yaml', 'w') as fh:
        yaml.dump(rf_model.get_params(), fh)
Hyperparameters tuning with GridSearchCV¶
%%time
if training == True:
    # Tune a Random Forest with an exhaustive grid search (5-fold CV, scored
    # by negative MSE). The grid was pruned after earlier exploratory runs.
    rf_model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [10, 12],
        'min_samples_split': [10, 15],
        'min_samples_leaf': [14, 16],
        'bootstrap': [True]
    }
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    # Report the winning configuration and its CV score.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f'Best Parameters: {best_params}')
    print(f'Best Score: {best_score}')
    # FIX: GridSearchCV (refit=True by default) has already refit the best
    # hyperparameter combination on all of X_train, so best_estimator_ is
    # ready to use. The previous explicit best_rf_model.fit(X_train, y_train)
    # was redundant and — since RandomForestRegressor has no random_state
    # here — silently replaced the CV-selected forest with a different one.
    best_rf_model = grid_search.best_estimator_
else:
    # Inference mode: load the tuned model saved by a previous training run.
    with open('./saved_models/Random Forest/rf_model.pkl', 'rb') as file:
        best_rf_model = pickle.load(file)

# Evaluate the tuned model on the held-out split.
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'bootstrap': True, 'max_depth': 12, 'min_samples_leaf': 14, 'min_samples_split': 10, 'n_estimators': 200}
Best Score: -654.4040213951187
Root Mean Squared Error: 25.531830365103335
R-squared: 0.7837401344724261
CPU times: total: 35.9 s
Wall time: 1min 54s
# Which inputs drive the tuned forest's predictions?
feature_importances_df = pd.DataFrame({
    'feature': X_train.columns,
    'feature_imp': best_rf_model.feature_importances_,
})
fig_10 = px.bar(feature_importances_df, x='feature', y='feature_imp', title='Feature Importances of Random Forest Model')
fig_10
We observe that the columns: cycles and sensor 11 are the most important features in predicting the RUL.
# Collect ground truth and per-model predictions side by side; the 'y_xgb'
# column stays empty (NaN) until the XGBoost section fills it in.
comparing_pred_df = pd.DataFrame(columns=['y_ground', 'y_rf', 'y_xgb'])
comparing_pred_df['y_ground'] = y_test
comparing_pred_df['y_rf'] = y_pred
# Saving the Random Forest model and parameters
if training == True:
with open('./saved_models/Random Forest/rf_model.pkl', 'wb') as f:
pickle.dump(best_rf_model, f)
with open('./saved_models/Random Forest/rf_hyperparameters.yaml', 'w') as f:
yaml.dump(best_rf_model.get_params(), f)
XGBoost¶
%%time
if training==True:
    # Tune an XGBoost regressor with grid search (5-fold CV, negative MSE).
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror')
    param_grid = {
        'n_estimators': [25, 50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.7, 0.8, 1.0],
        'colsample_bytree': [0.7, 0.8, 1.0]
    }
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    # Report the winning configuration and its CV score.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(f'Best Parameters: {best_params}')
    print(f'Best Score: {best_score}')
    # FIX: refit=True (the GridSearchCV default) has already refit the best
    # combination on all of X_train; the previous explicit
    # best_xgb_model.fit(X_train, y_train) was redundant work.
    best_xgb_model = grid_search.best_estimator_

# NOTE(review): unlike the Random Forest section, there is no else branch
# loading a saved model, so this cell requires training=True to define
# best_xgb_model before the evaluation below.
# Predictions and evaluation on the held-out split.
y_pred = best_xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')
Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best Score: -655.3236572265625
Root Mean Squared Error: 25.290101490473255
R-squared: 0.7878157496452332
CPU times: total: 31.9 s
Wall time: 1min 21s
# Persist the tuned XGBoost model and its hyperparameters (training runs only).
if training:
    with open('./saved_models/XGBoost/xgb_model.pkl', 'wb') as fh:
        pickle.dump(best_xgb_model, fh)
    with open('./saved_models/XGBoost/xgb_hyperparameters.yaml', 'w') as fh:
        yaml.dump(best_xgb_model.get_params(), fh)
# Fill in the XGBoost predictions and display the three-way comparison table.
comparing_pred_df['y_xgb'] = y_pred
comparing_pred_df
| y_ground | y_rf | y_xgb | |
|---|---|---|---|
| 11366 | 25 | 57.766451 | 56.432632 |
| 3832 | 101 | 128.643630 | 125.958885 |
| 11981 | 145 | 153.792240 | 166.823669 |
| 18828 | 27 | 23.163588 | 28.785709 |
| 2244 | 131 | 148.630396 | 144.617905 |
| ... | ... | ... | ... |
| 6543 | 68 | 104.794687 | 106.140289 |
| 16110 | 27 | 26.255231 | 24.903612 |
| 14171 | 166 | 151.065582 | 144.633667 |
| 4213 | 149 | 146.732738 | 141.506668 |
| 14999 | 159 | 122.844246 | 128.025177 |
3677 rows × 3 columns